# Oldest CMake release this build script supports.
cmake_minimum_required(VERSION 3.5.1)

# Project-local CMake modules are looked up in <source>/cmake.
set(CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake")

# Policy CMP0074 exists from CMake 3.12 on; opt in to its NEW behaviour
# whenever the running CMake knows about it.
if(POLICY CMP0074)
  cmake_policy(SET CMP0074 NEW)
endif()

# Declare the project; both the C++ and the C compiler are needed.
project(marian CXX C)

# Language standard for host code. This has to agree with the explicit
# -std=c++11 passed in CMAKE_CXX_FLAGS and in CUDA_NVCC_FLAGS further down
# (and with the g++ 5.0 minimum enforced there); requesting a newer standard
# here would compile host and device translation units with different
# standards.
set(CMAKE_CXX_STANDARD 11)
set(CMAKE_CXX_STANDARD_REQUIRED ON)

# Value handed to -march= on non-MSVC compilers. With "native" the supported
# CPU intrinsics are probed instead of being forced by the COMPILE_* options.
set(BUILD_ARCH native CACHE STRING "Compile for this CPU architecture.")

# Build switches; override any of them with -D<NAME>=ON|OFF at configure time.

# What gets compiled
option(COMPILE_CPU "Compile CPU version" ON)
option(COMPILE_CUDA "Compile GPU version" ON)
option(COMPILE_EXAMPLES "Compile examples" OFF)
option(COMPILE_SERVER "Compile marian-server" OFF)
option(COMPILE_TESTS "Compile tests" OFF)

# Apple Accelerate defaults to ON on Apple platforms and to OFF everywhere else.
if(APPLE)
  option(USE_APPLE_ACCELERATE "Compile with Apple Accelerate" ON)
else()
  option(USE_APPLE_ACCELERATE "Compile with Apple Accelerate" OFF)
endif()

# Optional tools and third-party libraries
option(USE_CCACHE "Use ccache compiler cache (https://ccache.dev)" OFF)
option(USE_CUDNN "Use CUDNN library" OFF)
option(USE_DOXYGEN "Build documentation with Doxygen" ON)
option(USE_FBGEMM "Use FBGEMM" OFF)
option(USE_MKL "Compile with MKL support" ON)
option(USE_MPI "Use MPI library" OFF)
option(USE_NCCL "Use NCCL library" ON)
option(USE_SENTENCEPIECE "Download and compile SentencePiece" ON)
option(USE_STATIC_LIBS "Link statically against non-system libs" OFF)

# Packaging and reproducibility
option(GENERATE_MARIAN_INSTALL_TARGETS "Generate Marian install targets (requires CMake 3.12+)" OFF)
option(DETERMINISTIC "Try to make training results as deterministic as possible (e.g. for testing)" OFF)

# The install rules for fbgemm and sentencepiece are "non-local": the upstream
# projects do not provide them, so they are written in
# src/3rd_party/CMakeLists.txt rather than next to the target definitions.
# CMake only allows install(...) on targets from another directory starting
# with 3.12, while the minimum supported version here is 3.5.1. On older CMake
# the install targets are therefore switched off (with a warning).
if(GENERATE_MARIAN_INSTALL_TARGETS AND CMAKE_VERSION VERSION_LESS "3.12")
  message(WARNING "Marian install targets cannot be generated on CMake <3.12.\
    Please upgrade your CMake version or set GENERATE_MARIAN_INSTALL_TARGETS=OFF to remove this warning. Disabling installation targets.")
  # FORCE overwrites the user's cached value so later checks see OFF.
  set(GENERATE_MARIAN_INSTALL_TARGETS OFF
      CACHE BOOL "Forcing disabled installation targets due to CMake <3.12." FORCE)
endif()

if(GENERATE_MARIAN_INSTALL_TARGETS)
  # Default installation directories (used on every platform despite the GNU name).
  include(GNUInstallDirs)
  # Have the `install` target ship required system libraries (e.g. Windows SDK).
  include(InstallRequiredSystemLibraries)
  # Helpers for creating relocatable packages.
  include(CMakePackageConfigHelpers)

  # Install the exported target set.
  install(EXPORT marian-targets
          DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake)
endif()

# Optional ccache (https://ccache.dev) support: when requested and found, every
# compile command is launched through ccache to speed up repeated builds.
if(USE_CCACHE)
  find_program(CCACHE_PROGRAM ccache)
  if(NOT CCACHE_PROGRAM)
    message(WARNING "Compilation with ccache requested but no ccache found.")
  else()
    message(STATUS "Will be using ccache for faster repeat compilation (use cmake -DUSE_CCACHE=off to disable).")
    set_property(GLOBAL PROPERTY RULE_LAUNCH_COMPILE "${CCACHE_PROGRAM}")
  endif()
endif()

# Project versioning
# Git is looked up silently (QUIET); GetVersionFromFile is a project module
# resolved through CMAKE_MODULE_PATH.
# NOTE(review): PROJECT_VERSION_STRING_FULL is presumably defined by
# GetVersionFromFile - confirm in cmake/GetVersionFromFile.cmake.
find_package(Git QUIET)
include(GetVersionFromFile)

message(STATUS "Project name: ${PROJECT_NAME}")
message(STATUS "Project version: ${PROJECT_VERSION_STRING_FULL}")

# Check out all submodules at configure time. --no-fetch only initialises and
# updates from what is already available locally. The command is run as plain
# `git` (not ${GIT_EXECUTABLE}) and its result is not checked, so a failure
# here does not stop the configure step.
execute_process(COMMAND git submodule update --init --recursive --no-fetch
                WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})

# Fall back to a Release build when no build type was given. (For CMake MSVC
# builds CMAKE_BUILD_TYPE is derived automatically from the 'configurationType'
# key of the CMakeSettings.json configurations.)
if(NOT CMAKE_BUILD_TYPE)
  message(WARNING "CMAKE_BUILD_TYPE not set; setting to Release")
  set(CMAKE_BUILD_TYPE Release)
endif()

###############################################################################
# Compiler and linker flags (MSVC first, every other compiler afterwards)
if(MSVC)
  # Warning switches; they are consumed per target in src/CMakeLists.txt.
  list(APPEND ALL_WARNINGS /WX /W4)

  # Warnings silenced for every target (bogus for CPU intrinsics and Protobuf):
  #   C4310: cast truncates constant value
  #   C4324: 'marian::cpu::int16::`anonymous-namespace'::ScatterPut': structure was padded due to alignment specifier
  #   C4702: unreachable code; also disabled globally in the VS project file
  #   C4996: warning STL4015: std::iterator (used as a base class to provide typedefs) is deprecated in C++17
  #   C4100: 'identifier': unreferenced formal parameter (only silenced when SentencePiece is enabled)
  set(DISABLE_GLOBALLY "/wd\"4310\" /wd\"4324\" /wd\"4702\" /wd\"4996\"")
  if(USE_SENTENCEPIECE)
    set(DISABLE_GLOBALLY "${DISABLE_GLOBALLY} /wd\"4100\"")
  endif()

  add_definitions(-DUSE_SSE2=1)

  # Vector instruction set for MSVC builds. /arch:AVX and /arch:AVX512 are the
  # alternatives that have been considered.
  set(INTRINSICS "/arch:AVX2")

  # /bigobj is necessary for expression_operators.cpp.
  # See https://stackoverflow.com/questions/15110580/penalty-of-the-msvs-compiler-flag-bigobj
  set(CMAKE_CXX_FLAGS         "/EHsc /DWIN32 /D_WINDOWS /DUNICODE /D_UNICODE /D_CRT_NONSTDC_NO_WARNINGS /D_CRT_SECURE_NO_WARNINGS /bigobj ${DISABLE_GLOBALLY}")
  set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS} /MT /O2 ${INTRINSICS} /Zi /MP /GL /DNDEBUG")
  set(CMAKE_CXX_FLAGS_DEBUG   "${CMAKE_CXX_FLAGS} /MTd /Od /Ob0 ${INTRINSICS} /RTC1 /Zi /D_DEBUG")

  # /ignore:4049 hides warning LNK4049 (locally defined symbol free imported),
  # which is triggered by zlib.
  string(APPEND CMAKE_EXE_LINKER_FLAGS " /DEBUG /LTCG:incremental /INCREMENTAL:NO /ignore:4049")
  set(CMAKE_EXE_LINKER_FLAGS_RELEASE "${CMAKE_EXE_LINKER_FLAGS} /NODEFAULTLIB:MSVCRT")
  set(CMAKE_EXE_LINKER_FLAGS_DEBUG   "${CMAKE_EXE_LINKER_FLAGS} /NODEFAULTLIB:MSVCRTD")
  string(APPEND CMAKE_STATIC_LINKER_FLAGS " /LTCG:incremental")

  # The library is added to EXT_LIBS by name (SHLWAPI), not by the located path.
  find_library(SHLWAPI Shlwapi.lib)
  list(APPEND EXT_LIBS SHLWAPI)

  if(USE_FBGEMM)
    # Through CMake, FBGEMM can only be built as a static library on Windows.
    if(NOT USE_STATIC_LIBS)
      message(FATAL_ERROR "FATAL ERROR: FBGEMM must be compiled statically on Windows, \
      add -DUSE_STATIC_LIBS=on to the cmake command")
    endif()
    list(APPEND EXT_LIBS fbgemm)
    add_definitions(-DUSE_FBGEMM=1 -DFBGEMM_STATIC=1)
  endif()
else()

  # g++ older than 5.0 is not supported.
  if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU" AND CMAKE_CXX_COMPILER_VERSION VERSION_LESS 5.0)
    message(FATAL_ERROR "FATAL ERROR: Compiling Marian requires at least g++ 5.0, your version is ${CMAKE_CXX_COMPILER_VERSION}")
  endif()

  # CPU intrinsics. With BUILD_ARCH=native the platform is probed and the
  # COMPILE_* switches below are only used to warn about ignored settings.
  # For any other BUILD_ARCH the switches decide which -m flags are forced.
  set(INTRINSICS "")
  list(APPEND INTRINSICS_NVCC) # appends nothing; the list is filled below

  option(COMPILE_SSE2   "Compile CPU code with SSE2 support"   ON)
  option(COMPILE_SSE3   "Compile CPU code with SSE3 support"   ON)
  option(COMPILE_SSE4_1 "Compile CPU code with SSE4.1 support" ON)
  option(COMPILE_SSE4_2 "Compile CPU code with SSE4.2 support" ON)
  option(COMPILE_AVX    "Compile CPU code with AVX support"    ON)
  option(COMPILE_AVX2   "Compile CPU code with AVX2 support"   ON)
  option(COMPILE_AVX512 "Compile CPU code with AVX512 support" ON)

  if(BUILD_ARCH STREQUAL "native")
    message(STATUS "Building with -march=native and intrinsics will be chosen automatically by the compiler to match the current machine.")
    message(STATUS "Checking support for CPU intrinsics")
    include(FindSSE)
    # Warn for each instruction set that was detected but switched off by the
    # user: -march=native enables it regardless. The display name is the option
    # suffix with '_' replaced by '.' (SSE4_1 -> SSE4.1).
    foreach(_isa SSE2 SSE3 SSE4_1 SSE4_2 AVX AVX2 AVX512)
      if(${_isa}_FOUND AND NOT COMPILE_${_isa})
        string(REPLACE "_" "." _isa_label "${_isa}")
        message(WARNING "${_isa_label} enabled due to -march=native and -DCOMPILE_${_isa}=${COMPILE_${_isa}} is ignored.")
      endif()
    endforeach()
    unset(_isa_label)
  else()
    # Force the requested intrinsics; the compiler has to support them.
    message(STATUS "Building with -march=${BUILD_ARCH} and forcing intrisics as requested")
    if(COMPILE_SSE2)
      message(STATUS "SSE2 support requested")
      string(APPEND INTRINSICS " -msse2")
      list(APPEND INTRINSICS_NVCC "-Xcompiler -msse2")
    endif()
    if(COMPILE_SSE3)
      message(STATUS "SSE3 support requested")
      string(APPEND INTRINSICS " -msse3")
      list(APPEND INTRINSICS_NVCC "-Xcompiler -msse3")
    endif()
    if(COMPILE_SSE4_1)
      message(STATUS "SSE4.1 support requested")
      string(APPEND INTRINSICS " -msse4.1")
      list(APPEND INTRINSICS_NVCC "-Xcompiler -msse4.1")
    endif()
    if(COMPILE_SSE4_2)
      message(STATUS "SSE4.2 support requested")
      string(APPEND INTRINSICS " -msse4.2")
      list(APPEND INTRINSICS_NVCC "-Xcompiler -msse4.2")
    endif()
    if(COMPILE_AVX)
      message(STATUS "AVX support requested")
      string(APPEND INTRINSICS " -mavx")
      list(APPEND INTRINSICS_NVCC "-Xcompiler -mavx")
    endif()
    if(COMPILE_AVX2)
      message(STATUS "AVX2 support requested")
      string(APPEND INTRINSICS " -mavx2")
      list(APPEND INTRINSICS_NVCC "-Xcompiler -mavx2")
    endif()
    if(COMPILE_AVX512)
      message(STATUS "AVX512 support requested")
      string(APPEND INTRINSICS " -mavx512f")
      list(APPEND INTRINSICS_NVCC "-Xcompiler -mavx512f")
    endif()
  endif()

  if(USE_FBGEMM)
    list(APPEND EXT_LIBS fbgemm dl)
    add_definitions(-DUSE_FBGEMM=1)
  endif()

  # Clang-10.0.0 complains when CUDA is newer than 10.1, so for Clang newer
  # than 9.0 the corresponding warnings are switched off.
  if(CMAKE_CXX_COMPILER_ID MATCHES "Clang" AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 9.0)
    set(CLANG_IGNORE_UNKNOWN_CUDA "-Wno-unknown-warning-option -Wno-unknown-cuda-version")
  endif()
  set(DISABLE_GLOBALLY "-Wno-unused-result ${CLANG_IGNORE_UNKNOWN_CUDA}")

  # Warning switches; they are consumed per target in src/CMakeLists.txt.
  list(APPEND ALL_WARNINGS
    -Wall
    -Werror
    -Wextra
    -Wno-unused-result
    -Wno-deprecated
    -Wno-pragmas
    -Wno-unused-parameter
    -Wno-unused-function
    -Wno-unused-value
    -Wno-unknown-pragmas
    -Wno-sign-compare
    -Wno-missing-field-initializers)

  if(CMAKE_COMPILER_IS_GNUCC)
    # -Wsuggest-override does not exist prior to gcc 5.0.
    if(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 5.0)
      list(APPEND ALL_WARNINGS -Wsuggest-override -Wno-int-in-bool-context)
    endif()

    # Flags that clang does not know.
    set(CMAKE_GCC_FLAGS "-Wl,--no-as-needed")
    set(CMAKE_RDYNAMIC_FLAG "-rdynamic")
  endif()

  # C++ flags: common part first, then one set per build configuration.
  set(CMAKE_CXX_FLAGS                "-std=c++11 -pthread ${CMAKE_GCC_FLAGS} -fPIC ${DISABLE_GLOBALLY} -march=${BUILD_ARCH} ${INTRINSICS}")
  set(CMAKE_CXX_FLAGS_RELEASE        "-O3 -m64 -funroll-loops -g ${CMAKE_RDYNAMIC_FLAG}")
  set(CMAKE_CXX_FLAGS_DEBUG          "-O0 -g ${CMAKE_RDYNAMIC_FLAG}")
  set(CMAKE_CXX_FLAGS_SLIM           "-O3 -m64 -funroll-loops -DNDEBUG")
  set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELEASE}")
  set(CMAKE_CXX_FLAGS_PROFILE        "${CMAKE_CXX_FLAGS_RELEASE} -pg")
  set(CMAKE_CXX_FLAGS_PROFGEN        "${CMAKE_CXX_FLAGS_RELEASE} -fprofile-generate -fprofile-correction")
  set(CMAKE_CXX_FLAGS_PROFUSE        "${CMAKE_CXX_FLAGS_RELEASE} -fprofile-use -fprofile-correction")

  # C flags have to be set separately; same layout minus the -std switch.
  set(CMAKE_C_FLAGS                "-pthread ${CMAKE_GCC_FLAGS} -fPIC ${DISABLE_GLOBALLY} -march=${BUILD_ARCH} ${INTRINSICS}")
  set(CMAKE_C_FLAGS_RELEASE        "-O3 -m64 -funroll-loops -g ${CMAKE_RDYNAMIC_FLAG}")
  set(CMAKE_C_FLAGS_DEBUG          "-O0 -g ${CMAKE_RDYNAMIC_FLAG}")
  set(CMAKE_C_FLAGS_SLIM           "-O3 -m64 -funroll-loops -DNDEBUG")
  set(CMAKE_C_FLAGS_RELWITHDEBINFO "${CMAKE_C_FLAGS_RELEASE}")
  set(CMAKE_C_FLAGS_PROFILE        "${CMAKE_C_FLAGS_RELEASE} -pg")
  set(CMAKE_C_FLAGS_PROFGEN        "${CMAKE_C_FLAGS_RELEASE} -fprofile-generate -fprofile-correction")
  set(CMAKE_C_FLAGS_PROFUSE        "${CMAKE_C_FLAGS_RELEASE} -fprofile-use -fprofile-correction")
endif()

# From gcc 7.0 on, fallthrough in switch/case statements has to be marked.
# For backwards compatibility that is done in comments, but ccache strips
# comments; -C tells gcc to keep them.
if(USE_CCACHE)
  string(APPEND CMAKE_CXX_FLAGS " -C")
endif()

###############################################################################
# SentencePiece: when enabled it is downloaded and compiled with the project,
# which brings in every dependency SentencePiece itself requires.
if(USE_SENTENCEPIECE)
  # Same define for host and device compilation.
  string(APPEND CMAKE_CXX_FLAGS " -DUSE_SENTENCEPIECE")
  list(APPEND CUDA_NVCC_FLAGS -DUSE_SENTENCEPIECE)
  list(APPEND EXT_LIBS sentencepiece sentencepiece_train)
endif()

# Experimental ONNX support. USE_ONNX is not declared with option() in this
# file, so it is only active when passed explicitly (-DUSE_ONNX=ON).
if(USE_ONNX)
  message(STATUS "Enabling experimental ONNX support")
  string(APPEND CMAKE_CXX_FLAGS " -DUSE_ONNX")
  # TODO: protobuf most likely has to be located here explicitly; so far this
  # relied on sentencepiece having done that already. Check and fix.
  list(APPEND EXT_LIBS protobuf)
  include_directories(${Protobuf_INCLUDE_DIRS})
endif()

# Find packages
# Start with the platform's dynamic-loading libraries (CMAKE_DL_LIBS).
list(APPEND EXT_LIBS ${CMAKE_DL_LIBS})

###############################################################################
# CUDA: locate the toolkit, pick the GPU architectures to generate code for and
# collect the CUDA libraries to link against.
if(COMPILE_CUDA)

  if(USE_STATIC_LIBS)
    # Link the C/C++ runtime statically (not applicable to MSVC).
    if(NOT MSVC)
      set(CMAKE_EXE_LINKER_FLAGS "-static-libgcc -static-libstdc++")
    endif()

    # Make find_library prefer static archives; the original suffix list is
    # saved and restored at the end of the CUDA section.
    set(_ORIG_CMAKE_FIND_LIBRARY_SUFFIXES ${CMAKE_FIND_LIBRARY_SUFFIXES})
    if(WIN32)
      list(INSERT CMAKE_FIND_LIBRARY_SUFFIXES 0 .lib .a)
    else()
      set(CMAKE_FIND_LIBRARY_SUFFIXES .a _static.a)
    endif()
  endif()

  # TODO: only enable FP16-related options for compute_70 and higher.
  find_package(CUDA "9.0")

  # Without a usable toolkit there is nothing left to do: explain and abort.
  if(NOT CUDA_FOUND)
    message("
Cannot find suitable CUDA libraries. Specify the path explicitly with
  -DCUDA_TOOLKIT_ROOT_DIR=/path/to/appropriate/cuda/installation
   (hint: try /usr/local/$(readlink /usr/local/cuda))
OR compile the CPU-only version of Marian with
  -DCOMPILE_CUDA=off
")
    message(FATAL_ERROR "FATAL ERROR: No suitable CUDA library found.")
  endif()

  # CUDA >= 10.0 requires CMake >= 3.12.2
  if((NOT CUDA_VERSION VERSION_LESS "10.0") AND (CMAKE_VERSION VERSION_LESS "3.12.2"))
    message(WARNING "On some Unix systems CUDA 10.0+ requires CMake 3.12.2+; you use CMake ${CMAKE_VERSION}")
  endif()

  # As many GPU targets as possible should be compiled, but which ones are
  # available depends on the CUDA version, so each option is only offered from
  # the first CUDA release it is declared for.
  if(NOT CUDA_VERSION VERSION_LESS "9.0")
    # Kepler and Maxwell are deprecated for CUDA 11.
    option(COMPILE_KEPLER  "Compile GPU version with SM35 support" OFF)
    option(COMPILE_MAXWELL "Compile GPU version with SM50 support" OFF)
    option(COMPILE_PASCAL  "Compile GPU version with SM60 support" ON)
    option(COMPILE_VOLTA   "Compile GPU version with SM70 support" ON)
  endif()
  if(NOT CUDA_VERSION VERSION_LESS "10.0")
    option(COMPILE_TURING  "Compile GPU version with SM75 support" ON)
  endif()
  if(NOT CUDA_VERSION VERSION_LESS "11.0")
    option(COMPILE_AMPERE  "Compile GPU version with SM80 support" ON)
    list(APPEND COMPUTE -Wno-deprecated-gpu-targets)
  endif()
  if(NOT CUDA_VERSION VERSION_LESS "11.1")
    option(COMPILE_AMPERE_RTX  "Compile GPU version with SM86 support" ON)
    # Appended a second time for CUDA 11.1+, exactly as before; duplicates are
    # dropped once CUDA_NVCC_FLAGS is de-duplicated.
    list(APPEND COMPUTE -Wno-deprecated-gpu-targets)
  endif()

  # Translate the selected architectures into -gencode switches.
  if(COMPILE_KEPLER)
    message(STATUS "Compiling code for Kepler GPUs")
    list(APPEND COMPUTE -gencode=arch=compute_35,code=sm_35) # Tesla K40 and above
  endif()
  if(COMPILE_MAXWELL)
    message(STATUS "Compiling code for Maxwell GPUs")
    list(APPEND COMPUTE
      -gencode=arch=compute_50,code=sm_50
      -gencode=arch=compute_52,code=sm_52)
  endif()
  if(COMPILE_PASCAL)
    message(STATUS "Compiling code for Pascal GPUs")
    list(APPEND COMPUTE
      -gencode=arch=compute_60,code=sm_60
      -gencode=arch=compute_61,code=sm_61)
  endif()
  if(COMPILE_VOLTA)
    message(STATUS "Compiling code for Volta GPUs")
    list(APPEND COMPUTE
      -arch=sm_70
      -gencode=arch=compute_70,code=sm_70
      -gencode=arch=compute_70,code=compute_70)
  endif()
  if(COMPILE_TURING AND (NOT CUDA_VERSION VERSION_LESS "10.0"))
    message(STATUS "Compiling code for Turing GPUs")
    list(APPEND COMPUTE
      -gencode=arch=compute_75,code=sm_75
      -gencode=arch=compute_75,code=compute_75)
  endif()
  if(COMPILE_AMPERE AND (NOT CUDA_VERSION VERSION_LESS "11.0"))
    message(STATUS "Compiling code for Ampere GPUs")
    list(APPEND COMPUTE
      -gencode=arch=compute_80,code=sm_80
      -gencode=arch=compute_80,code=compute_80)
  endif()
  if(COMPILE_AMPERE_RTX AND (NOT CUDA_VERSION VERSION_LESS "11.1"))
    message(STATUS "Compiling code for Ampere RTX GPUs")
    list(APPEND COMPUTE
      -gencode=arch=compute_86,code=sm_86
      -gencode=arch=compute_86,code=compute_86)
  endif()

  # CUDA libraries. CUDA_LIBS mirrors what is added to EXT_LIBS for CUDA.
  if(USE_STATIC_LIBS)
    set(CUDA_LIBS ${CUDA_curand_LIBRARY} ${CUDA_CUBLAS_LIBRARIES} ${CUDA_cusparse_LIBRARY})
    list(APPEND EXT_LIBS ${CUDA_LIBS})

    # cuLIBOS does not seem to exist in Windows CUDA toolkit installs, so a
    # missing library is only fatal on other platforms.
    find_library(CUDA_culibos_LIBRARY
      NAMES culibos
      PATHS ${CUDA_TOOLKIT_ROOT_DIR}/lib64 ${CUDA_TOOLKIT_ROOT_DIR}/lib/x64)
    if(CUDA_culibos_LIBRARY)
      list(APPEND EXT_LIBS ${CUDA_culibos_LIBRARY})
      list(APPEND CUDA_LIBS ${CUDA_culibos_LIBRARY})
    elseif(NOT WIN32)
      message(FATAL_ERROR "cuLIBOS library not found")
    endif()

    # CUDA 10.1 introduces the cublasLt library, which static builds require.
    if(NOT CUDA_VERSION VERSION_LESS "10.1")
      find_library(CUDA_cublasLt_LIBRARY
        NAMES cublasLt
        PATHS ${CUDA_TOOLKIT_ROOT_DIR}/lib64 ${CUDA_TOOLKIT_ROOT_DIR}/lib/x64)
      if(NOT CUDA_cublasLt_LIBRARY)
        message(FATAL_ERROR "cuBLASLt library not found")
      endif()
      list(APPEND EXT_LIBS ${CUDA_cublasLt_LIBRARY})
      list(APPEND CUDA_LIBS ${CUDA_cublasLt_LIBRARY})
    endif()
    message(STATUS "Found CUDA libraries: ${CUDA_LIBS}")
  else()
    set(CUDA_LIBS ${CUDA_curand_LIBRARY} ${CUDA_cusparse_LIBRARY} ${CUDA_CUBLAS_LIBRARIES})

    # cublasLt is only really needed from CUDA 11 on; Marian works without it
    # on older toolkits. NO_DEFAULT_PATH restricts the search to the CUDA
    # toolkit directory so the cuBLAS shipped with CUDA 11 is the one used.
    if(NOT CUDA_VERSION VERSION_LESS "11.0")
      find_library(CUDA_cublasLt_LIBRARY
        NAMES cublasLt
        PATHS ${CUDA_TOOLKIT_ROOT_DIR}/lib64 ${CUDA_TOOLKIT_ROOT_DIR}/lib/x64
        NO_DEFAULT_PATH)
      if(NOT CUDA_cublasLt_LIBRARY)
        message(FATAL_ERROR "cuBLASLt library not found")
      endif()
      list(APPEND EXT_LIBS ${CUDA_cublasLt_LIBRARY})
      list(APPEND CUDA_LIBS ${CUDA_cublasLt_LIBRARY})
    endif()
    # In EXT_LIBS cublasLt (if any) comes before the remaining CUDA libraries.
    list(APPEND EXT_LIBS ${CUDA_curand_LIBRARY} ${CUDA_cusparse_LIBRARY} ${CUDA_CUBLAS_LIBRARIES})
    message(STATUS "Found CUDA libraries: ${CUDA_LIBS}")
  endif()

  # Optional cuDNN; silently skipped when the package is not found.
  if(USE_CUDNN)
    find_package(CUDNN "7.0")
    if(CUDNN_FOUND)
      include_directories(${CUDNN_INCLUDE_DIRS})
      list(APPEND EXT_LIBS ${CUDNN_LIBRARIES})
      string(APPEND CMAKE_CXX_FLAGS " -DCUDNN")
      list(APPEND CUDA_NVCC_FLAGS -DCUDNN)
    endif()
  endif()

  # Tell host and device code that CUDA is available.
  string(APPEND CMAKE_CXX_FLAGS " -DCUDA_FOUND")
  list(APPEND CUDA_NVCC_FLAGS -DCUDA_FOUND)

  if(MSVC)
    list(APPEND CUDA_NVCC_FLAGS -DBOOST_PP_VARIADICS=0)
  endif()

  # NCCL is declared as an imported static library target named `nccl`.
  if(USE_NCCL)
    add_library(nccl STATIC IMPORTED)
    list(APPEND EXT_LIBS nccl)
    string(APPEND CMAKE_CXX_FLAGS " -DUSE_NCCL")
    list(APPEND CUDA_NVCC_FLAGS -DUSE_NCCL)
  endif()

  # Restore the library suffix list saved at the top of this section.
  if(USE_STATIC_LIBS)
    set(CMAKE_FIND_LIBRARY_SUFFIXES ${_ORIG_CMAKE_FIND_LIBRARY_SUFFIXES})
  endif()

else()
  message(WARNING "COMPILE_CUDA=off : Building only CPU version")
endif()

# NVCC command line.
# TODO: make compatible with older CUDA versions
# Debug builds switch device optimisation off; every other build type uses -O3.
if(CMAKE_BUILD_TYPE STREQUAL "Debug")
  set(_nvcc_opt_level -O0)
else()
  set(_nvcc_opt_level -O3)
endif()
list(APPEND CUDA_NVCC_FLAGS
  --default-stream per-thread
  ${_nvcc_opt_level}
  -g
  --use_fast_math
  ${COMPUTE})
unset(_nvcc_opt_level)

if(MSVC)
  list(APPEND CUDA_NVCC_FLAGS
    "-Xcompiler /FS"
    "-Xcompiler /MT$<$<CONFIG:Debug>:d>")
else()
  # @TODO: add warnings here too
  list(APPEND CUDA_NVCC_FLAGS
    -ccbin ${CMAKE_C_COMPILER}
    -std=c++11
    "-Xcompiler -fPIC"
    "-Xcompiler -Wno-unused-result"
    "-Xcompiler -Wno-deprecated"
    "-Xcompiler -Wno-pragmas"
    "-Xcompiler -Wno-unused-value"
    "-Xcompiler -Werror")
  list(APPEND CUDA_NVCC_FLAGS ${INTRINSICS_NVCC})
endif()

# Several sections append the same define; keep one copy of each flag.
list(REMOVE_DUPLICATES CUDA_NVCC_FLAGS)
# Host compiler flags are not forwarded to NVCC automatically.
set(CUDA_PROPAGATE_HOST_FLAGS OFF)

# For static builds make the library searches below look for static archives.
# The current suffix list is saved so it can be restored afterwards.
if(USE_STATIC_LIBS)
  set(_ORIG_CMAKE_FIND_LIBRARY_SUFFIXES ${CMAKE_FIND_LIBRARY_SUFFIXES})
  if(NOT WIN32)
    set(CMAKE_FIND_LIBRARY_SUFFIXES .a)
  else()
    list(INSERT CMAKE_FIND_LIBRARY_SUFFIXES 0 .lib .a)
  endif()
endif()

###############################################################################
# Find Tcmalloc_minimal
# re-used from sentencepiece
#
# Not attempted on Windows. For static builds the archive is searched by its
# full file name, otherwise by library name. When the library is found it is
# added to EXT_LIBS and the compiler's built-in malloc/calloc/realloc/free are
# disabled; when it is missing the build simply continues without it.
if(NOT WIN32)
  if(USE_STATIC_LIBS)
    find_library(TCMALLOC_LIB NAMES libtcmalloc_minimal.a)
  else()
    find_library(TCMALLOC_LIB NAMES tcmalloc_minimal)
  endif()
  if(TCMALLOC_LIB)
    message(STATUS "Found TCMalloc: ${TCMALLOC_LIB}")
    # find_library stores its result in TCMALLOC_LIB; that is the path that has
    # to be linked (no Tcmalloc_LIBRARIES variable is defined by this file).
    set(EXT_LIBS ${EXT_LIBS} ${TCMALLOC_LIB})
    add_definitions(-fno-builtin-malloc -fno-builtin-calloc -fno-builtin-realloc -fno-builtin-free)
  else()
    message(STATUS "Not Found TCMalloc: ${TCMALLOC_LIB}")
  endif()
endif()

###############################################################################
# CPU backend: intgemm plus one BLAS implementation, tried in this order:
# Apple Accelerate, MKL, then a generic BLAS together with CBLAS.
if(COMPILE_CPU)
  # intgemm and the COMPILE_CPU define are only added when no install targets
  # are being generated.
  if(NOT GENERATE_MARIAN_INSTALL_TARGETS)
    list(APPEND EXT_LIBS intgemm)
    add_definitions(-DCOMPILE_CPU=1)
  endif()

  if(USE_APPLE_ACCELERATE)
    if(NOT APPLE)
      message(FATAL_ERROR "FATAL ERROR: Apple Accelerate only works on macOS.")
    endif()
    set(BLAS_VENDOR "Accelerate")
    # See https://developer.apple.com/documentation/accelerate for more info.
    # The Xcode command line tools may have to be installed first
    # (https://developer.apple.com/xcode/features/).
    include_directories("/Library/Developer/CommandLineTools/SDKs/MacOSX.sdk/System/Library/Frameworks/Accelerate.framework/Frameworks/vecLib.framework/Headers")
    list(APPEND EXT_LIBS "-framework Accelerate")
    add_definitions(-DBLAS_FOUND=1)
  else()
    # MKL is only searched for when requested ...
    if(USE_MKL)
      find_package(MKL)
    endif()

    if(MKL_FOUND)
      include_directories(${MKL_INCLUDE_DIR})
      list(APPEND EXT_LIBS ${MKL_LIBRARIES})
      set(BLAS_FOUND TRUE)
      add_definitions(-DBLAS_FOUND=1 -DMKL_FOUND=1)
    else()
      # ... otherwise fall back to OpenBLAS. BLAS support is only enabled when
      # both BLAS and CBLAS are found; if either is missing nothing is added.
      set(BLAS_VENDOR "OpenBLAS")
      find_package(BLAS)
      if(BLAS_FOUND)
        include(FindCBLAS)
        if(CBLAS_FOUND)
          include_directories(${BLAS_INCLUDE_DIR} ${CBLAS_INCLUDE_DIR})
          list(APPEND EXT_LIBS ${BLAS_LIBRARIES} ${CBLAS_LIBRARIES})
          add_definitions(-DBLAS_FOUND=1)
        endif()
      endif()
    endif()
  endif()
endif()

###############################################################################
# OpenSSL is only needed for marian-server. If it cannot be found the server
# is dropped from the build instead of failing the configuration.
set(BOOST_COMPONENTS "")
if(COMPILE_SERVER)
  find_package(OpenSSL)
  if(NOT OpenSSL_FOUND)
    message(WARNING "Cannot find OpenSSL library. Not compiling server.")
    set(COMPILE_SERVER "off")
  else()
    message(STATUS "Found OpenSSL")
    include_directories(${OPENSSL_INCLUDE_DIR})
    list(APPEND EXT_LIBS ${OPENSSL_CRYPTO_LIBRARY})
    if(MSVC AND USE_STATIC_LIBS)
      # "If you link with static OpenSSL libraries then you're expected to additionally link your
      # application with WS2_32.LIB, GDI32.LIB, ADVAPI32.LIB, CRYPT32.LIB and USER32.LIB"
      # See https://github.com/openssl/openssl/blob/OpenSSL_1_1_1d/NOTES.WIN#L127
      # Linking with crypt32.lib alone seems to be enough.
      list(APPEND EXT_LIBS crypt32.lib)
    endif()
    # The server additionally requests the Boost `system` component.
    list(APPEND BOOST_COMPONENTS system)
  endif()
endif()

###############################################################################
# Undo static lib search and put non-static searches here:

# Restore the find_library suffix list that was saved in
# _ORIG_CMAKE_FIND_LIBRARY_SUFFIXES before the static-only searches above, so
# the lookups that follow use the regular suffixes again.
if(USE_STATIC_LIBS)
  set(CMAKE_FIND_LIBRARY_SUFFIXES ${_ORIG_CMAKE_FIND_LIBRARY_SUFFIXES})
endif()

# DETERMINISTIC is always defined for host and device code: 1 when the option
# is on, 0 otherwise.
if(DETERMINISTIC)
  message(WARNING "Option DETERMINISTIC=ON: Trying to make training as deterministic as possible, may result in slow-down")
  set(_deterministic_define -DDETERMINISTIC=1)
else()
  set(_deterministic_define -DDETERMINISTIC=0)
endif()
add_definitions(${_deterministic_define})
list(APPEND CUDA_NVCC_FLAGS ${_deterministic_define})
unset(_deterministic_define)

# MPI (optional, but mandatory once requested)
if(USE_MPI)
  # The version 2.0 refers to the MPI2 standard, not to a release of a specific
  # implementation: OpenMPI 1.10, for example, implements MPI2 and is accepted.
  find_package(MPI 2.0 REQUIRED)
  if(MPI_FOUND)
    include_directories(${MPI_INCLUDE_PATH})
    list(APPEND EXT_LIBS ${MPI_LIBRARIES})
    # MPI stays dynamically linked even in static builds. An alternative would
    # be to install OpenMPI like NCCL and link against that statically with
    # greater control.
    if(USE_STATIC_LIBS)
      message(WARNING "MPI implementations are notoriously difficult to link statically, linking ${MPI_LIBRARIES} dynamically despite -DUSE_STATIC_LIBS=on")
    endif()
    add_definitions(-DMPI_FOUND=1)
  endif()
endif()


###############################################################################
# Boost is only searched for when some section above requested components.
if(BOOST_COMPONENTS)
  if(USE_STATIC_LIBS)
    set(Boost_USE_STATIC_LIBS ON)
  endif()

  find_package(Boost COMPONENTS ${BOOST_COMPONENTS})
  if(NOT Boost_FOUND)
    message(SEND_ERROR "Cannot find Boost libraries. Terminating.")
  else()
    include_directories(${Boost_INCLUDE_DIRS})
    # ZLIB_LIBRARIES is appended as a hack for static compilation.
    list(APPEND EXT_LIBS ${Boost_LIBRARIES} ${ZLIB_LIBRARIES})
    if(MSVC)
      add_definitions(-DBOOST_ALL_NO_LIB=1) # hack for missing date-time stub
    endif()
  endif()
endif()

###############################################################################
# Optional parts of the build: CTest support and the examples define.
if(COMPILE_TESTS)
  enable_testing()
endif()

if(COMPILE_EXAMPLES)
  add_definitions(-DCOMPILE_EXAMPLES=1)
endif()

# Generate project_version.h to reflect our version number.
# Note that this header is written into the source tree.
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/src/common/project_version.h.in
               ${CMAKE_CURRENT_SOURCE_DIR}/src/common/project_version.h @ONLY)

# Generate build_info.cpp with CMake cache variables
include(GetCacheVariables)

# build_info.cpp is generated into the build tree, so make sure no copy is
# left over in the source tree. file(REMOVE) is used instead of spawning `rm`
# so the cleanup also works on platforms without that tool (e.g. Windows); it
# does nothing, and reports no error, when the file does not exist.
file(REMOVE ${CMAKE_CURRENT_SOURCE_DIR}/src/common/build_info.cpp)
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/src/common/build_info.cpp.in
               ${CMAKE_CURRENT_BINARY_DIR}/src/common/build_info.cpp @ONLY)
# Lets the sources check whether this is a CMake-based compilation, which
# always provides the build-info option, even on Windows.
add_definitions(-DBUILD_INFO_AVAILABLE=1)

# Build the actual sources: <source>/src goes on the include path for every
# target and is then processed as a subdirectory.
include_directories("${marian_SOURCE_DIR}/src")
add_subdirectory(src)

###############################################################################
# API documentation: when Doxygen is requested and found, a `doc` target is
# added that runs Doxygen on a Doxyfile configured into the build directory.
if(USE_DOXYGEN)
  find_package(Doxygen)
  if(DOXYGEN_FOUND)
    configure_file(
      ${CMAKE_CURRENT_SOURCE_DIR}/Doxyfile.in
      ${CMAKE_CURRENT_BINARY_DIR}/Doxyfile
      @ONLY)
    add_custom_target(doc
      COMMAND ${DOXYGEN_EXECUTABLE} ${CMAKE_CURRENT_BINARY_DIR}/Doxyfile
      WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
      COMMENT "Generating API documentation with Doxygen"
      VERBATIM)
  endif()
endif()
